Trends in Data Science & Business Analytics
  • Home
  • Data Cleaning & Exploration
    • Data Cleaning
    • Exploratory Data Analysis
    • Skill Gap Analysis
  • Machine Learning Methods
    • Supervised Machine Learning
    • Unsupervised Machine Learning

Exploratory Data Analytics

Code
import pandas as pd
eda = pd.read_parquet("data/eda.parquet")
Code
# identifying data analyst jobs by keyword searching
keywords = ['Data Analyst', 'Business Analyst', 'Data Engineering', 'Deep Learning',
            'Data Science', 'Data Analysis','Data Analytics',  'Market Research Analyst' 
            'LLM', 'Language Model', 'NLP', 'Natural Language Processing',
            'Computer Vision', 'Business Intelligence Analyst', 'Quantitative Analyst', 'Operations Analyst']

match = lambda col: eda[col].str.contains('|'.join(keywords), case=False, na=False)

eda['DATA_ANALYST_JOB'] = match('TITLE_NAME') \
             | match('SKILLS_NAME') \
             | match('SPECIALIZED_SKILLS_NAME') 
eda['DATA_ANALYST_JOB'].value_counts()
DATA_ANALYST_JOB
False    38212
True     33042
Name: count, dtype: int64
Code
import plotly.graph_objects as go
from plotly.subplots import make_subplots

df_grouped = (
    eda
    .groupby(['DATA_ANALYST_JOB','NAICS2_NAME'])
    .size()
    .reset_index(name='Job_Count')
)

short_names = {
    'Professional, Scientific, and Technical Services': 'Prof. Services',
    'Administrative and Support and Waste Management and Remediation Services': 'Admin & Waste Mgmt',
    'Health Care and Social Assistance': 'Healthcare',
    'Finance and Insurance': 'Finance',
    'Information': 'Info Tech',
    'Educational Services': 'Education',
    'Manufacturing': 'Manufacturing',
    'Retail Trade': 'Retail',
    'Accommodation and Food Services': 'Hospitality',
    'Other Services (except Public Administration)': 'Other Services'
}
df_grouped['Industry'] = df_grouped['NAICS2_NAME'].map(short_names).fillna(df_grouped['NAICS2_NAME'])
df_grouped['Job_Type'] = df_grouped['DATA_ANALYST_JOB'].map({True:'True', False:'False'})

pivot = (
    df_grouped
    .pivot_table(index='Industry', columns='Job_Type', values='Job_Count', fill_value=0)
    .reset_index()
)
industries = pivot['Industry'].tolist()
y_true  = pivot['True'].tolist()
y_false = pivot['False'].tolist()


# 2) Build a 2-row subplot: bar on top, table below

fig = make_subplots(
    rows=2, cols=1,
    row_heights=[0.70, 0.30],           # give a bit more room to the table
    specs=[[{"type":"bar"}],[{"type":"table"}]],
    vertical_spacing=0.12              # more space between bar and table
)

colors = {'True': '#FFE5E5', 'False': '#FF6B6B'}

fig.add_trace(
    go.Bar(
        x=industries, y=y_true, name='True',
        marker=dict(color=colors['True'], line=dict(color='#A81D1D', width=1)),
        text=y_true, textposition='outside'
    ),
    row=1, col=1
)
fig.add_trace(
    go.Bar(
        x=industries, y=y_false, name='False',
        marker=dict(color=colors['False'], line=dict(color='#A81D1D', width=1)),
        text=y_false, textposition='outside'
    ),
    row=1, col=1
)



# 3) Slider steps: 0 → 8 000 in 200s

steps = []
for val in range(0, 8001, 200):
    steps.append(dict(
        label=str(val),
        method="update",
        args=[
            {"y": [
                [v if v>=val else 0 for v in y_true],
                [v if v>=val else 0 for v in y_false]
            ]},
            {"title": f"Min Jobs ≥ {val:,}"}
        ]
    ))


# 4) Final layout tweaks

fig.update_layout(
    # lift slider above everything
    sliders=[dict(
        active=0,
        currentvalue={"prefix":"Min Jobs: "},
        pad={"b":0},
        x=0.05,
        y=1.05,                # move slider way above the plot area
        xanchor="left",
        yanchor="bottom",
        len=0.7,
        font=dict(color='#A81D1D'),
        steps=steps
    )],

    title=dict(
        text="Data & Business Analytics Job Trends",
        font=dict(size=24, color='#A81D1D'),
        x=0.5,
        y=0.95,                # drop the title just below the slider
        xanchor="center",
        yanchor="top"
    ),

    width=1100, height=850,
    margin=dict(l=60, r=60, t=180, b=200),  # extra top & bottom margin

    plot_bgcolor='white',
    paper_bgcolor='white',

    xaxis=dict(
        title="Industry",
        title_font=dict(size=16, color='#A81D1D'),
        tickmode='array',
        tickvals=list(range(len(industries))),
        ticktext=industries,
        tickangle=-30,
        tickfont=dict(size=11, color='#333'),
        showline=True, linecolor='#A81D1D'
    ),
    yaxis=dict(
        title="Number of Jobs",
        title_font=dict(size=16, color='#A81D1D'),
        tickfont=dict(size=11, color='#333'),
        gridcolor='rgba(200,200,200,0.3)',
        showline=True, linecolor='#A81D1D',
        range=[0, max(max(y_true),max(y_false))*1.2]
    ),

    legend=dict(
        title="Data Analyst Job",
        title_font=dict(color='#A81D1D'),
        font=dict(size=12),
        x=0.95, y=0.95
    ),

    bargap=0.2
)
fig.show()
fig.write_html(
    "figures/edaplot1.html",
    include_plotlyjs="cdn",  # Use CDN to load Plotly JS
    full_html=False          # Only include the plot div
)
Code
import plotly.express as px
import pandas as pd

# Assuming eda is already loaded and DATA_ANALYST_JOB is defined
df = eda.copy()

# Step 1: Map DATA_ANALYST_JOB to labels
df['Job_Category'] = df['DATA_ANALYST_JOB'].map({True: 'Analytics Job', False: 'Non-Analytics Job'})

# Step 2: Clean the data (remove rows with missing SPECIALIZED_SKILLS_NAME)
df = df.dropna(subset=['SPECIALIZED_SKILLS_NAME'])

# Debug: Check the number of rows after cleaning
print("Number of rows after cleaning:", len(df))

# Step 3: Split the SPECIALIZED_SKILLS_NAME into individual skills
# Assuming SPECIALIZED_SKILLS_NAME is a string of skills separated by commas or another delimiter
df_skills = df.copy()
df_skills['SPECIALIZED_SKILLS_NAME'] = df_skills['SPECIALIZED_SKILLS_NAME'].str.split(',')  # Adjust delimiter if needed
df_skills = df_skills.explode('SPECIALIZED_SKILLS_NAME')
df_skills['SPECIALIZED_SKILLS_NAME'] = df_skills['SPECIALIZED_SKILLS_NAME'].str.strip()

# Step 4: Group by skill and Job_Category to get the count
df_skills_count = df_skills.groupby(['SPECIALIZED_SKILLS_NAME', 'Job_Category']).size().reset_index(name='Count')

# Step 5: Get the top 10 skills by total count
top_skills = df_skills_count.groupby('SPECIALIZED_SKILLS_NAME')['Count'].sum().nlargest(10).index
df_skills_top = df_skills_count[df_skills_count['SPECIALIZED_SKILLS_NAME'].isin(top_skills)]

# Debug: Check the grouped data
print("Top 10 specialized skills:")
print(df_skills_top)

# Step 6: Create the bar plot
fig = px.bar(
    df_skills_top,
    x='Count',
    y='SPECIALIZED_SKILLS_NAME',
    color='Job_Category',
    barmode='stack',
    color_discrete_map={'Analytics Job': '#FF6B6B', 'Non-Analytics Job': '#4ECDC4'},
    title='Top 10 Specialized Skills by Job Category'
)

# Step 7: Update layout for styling
fig.update_layout(
    width=900,
    height=600,
    plot_bgcolor='white',
    paper_bgcolor='white',
    font=dict(family='Inter, sans-serif', size=14, color='#2D3748'),
    title=dict(
        font=dict(size=24, color='#FF6B6B'),
        x=0.5,
        xanchor='center',
        y=0.95,
        yanchor='top'
    ),
    xaxis=dict(
        title='Number of Jobs',
        title_font=dict(size=16),
        tickfont=dict(size=12),
        gridcolor='#E2E8F0',
        linecolor='#2D3748',
        linewidth=2,
        showline=True,
        showgrid=True,
        zeroline=False
    ),
    yaxis=dict(
        title='Specialized Skill',
        title_font=dict(size=16),
        tickfont=dict(size=12)
    ),
    legend=dict(
        title='Job Category',
        font=dict(size=13),
        bgcolor='#FFFFFF',
        bordercolor='#FF6B6B',
        borderwidth=1,
        x=1.02,
        y=0.5,
        xanchor='left',
        yanchor='middle'
    )
)

fig.show()

# Save to HTML
fig.write_html(
    'figures/edaplot2.html',
    include_plotlyjs='cdn',
    full_html=False
)
Number of rows after cleaning: 71254
Top 10 specialized skills:
               SPECIALIZED_SKILLS_NAME       Job_Category  Count
2071           "Business Intelligence"      Analytics Job   8077
2072           "Business Intelligence"  Non-Analytics Job   1778
2167                "Business Process"      Analytics Job   4417
2168                "Business Process"  Non-Analytics Job   8385
2180           "Business Requirements"      Analytics Job   4972
2181           "Business Requirements"  Non-Analytics Job   7740
4151                       "Dashboard"      Analytics Job   9975
4152                       "Dashboard"  Non-Analytics Job   1559
4173                   "Data Analysis"      Analytics Job  25620
6640                         "Finance"      Analytics Job   5573
6641                         "Finance"  Non-Analytics Job   6220
13332             "Project Management"      Analytics Job   6236
13333             "Project Management"  Non-Analytics Job   6939
13594  "Python (Programming Language)"      Analytics Job  10127
13595  "Python (Programming Language)"  Non-Analytics Job   1703
14459               "SAP Applications"      Analytics Job   2174
14460               "SAP Applications"  Non-Analytics Job   9149
14686     "SQL (Programming Language)"      Analytics Job  16067
14687     "SQL (Programming Language)"  Non-Analytics Job   4202
Code
import plotly.express as px
import pandas as pd

# Assuming eda is already loaded and DATA_ANALYST_JOB is defined
df = eda.copy()

# Step 1: Calculate average years of experience
df['Avg_Years_Experience'] = (df['MIN_YEARS_EXPERIENCE'] + df['MAX_YEARS_EXPERIENCE']) / 2

# Step 2: Clean the data (remove rows with missing values for the variables we need)
df = df.dropna(subset=['SALARY', 'Avg_Years_Experience', 'REMOTE_TYPE_NAME', 'MIN_EDULEVELS_NAME'])

# Debug: Check the number of rows after cleaning
print("Number of rows after cleaning:", len(df))

# Step 3: Map DATA_ANALYST_JOB to labels
df['Job_Category'] = df['DATA_ANALYST_JOB'].map({True: 'Analytics Job', False: 'Non-Analytics Job'})

# Step 4: Encode categorical variables numerically for the parallel coordinates plot
# Encode REMOTE_TYPE_NAME (e.g., Remote=2, Hybrid=1, On-site=0)
remote_mapping = {'Remote': 2, 'Hybrid': 1, 'On-site': 0}  # Adjust based on your unique values
df['Remote_Type_Code'] = df['REMOTE_TYPE_NAME'].map(remote_mapping)

# Encode MIN_EDULEVELS_NAME (e.g., High School=0, Bachelor's=1, Master's=2, PhD=3)
edu_mapping = {'High School': 0, 'Bachelor\'s': 1, 'Master\'s': 2, 'PhD': 3}  # Adjust based on your unique values
df['Edu_Level_Code'] = df['MIN_EDULEVELS_NAME'].map(edu_mapping)

# Step 5: Clean again after encoding (in case some mappings failed)
df = df.dropna(subset=['Remote_Type_Code', 'Edu_Level_Code'])

# Debug: Check the encoded data
print("Sample encoded data:")
print(df[['SALARY', 'Avg_Years_Experience', 'Remote_Type_Code', 'Edu_Level_Code', 'Job_Category']].head())

# Step 6: Create the parallel coordinates plot
fig = px.parallel_coordinates(
    df,
    dimensions=['SALARY', 'Avg_Years_Experience', 'Remote_Type_Code', 'Edu_Level_Code'],
    color_continuous_scale=['#4ECDC4', '#FF6B6B'],  # Gradient from teal to red
    color=df['Job_Category'].map({'Analytics Job': 1, 'Non-Analytics Job': 0}),  # Color by Job_Category
    labels={
        'SALARY': 'Salary ($)',
        'Avg_Years_Experience': 'Avg. Years Experience',
        'Remote_Type_Code': 'Remote Type (0=On-site, 1=Hybrid, 2=Remote)',
        'Edu_Level_Code': 'Min. Education Level (0=HS, 1=Bachelor\'s, 2=Master\'s, 3=PhD)'
    },
    title='Multidimensional Comparison of Job Attributes by Job Category'
)

# Step 7: Update layout for styling
fig.update_layout(
    width=1000,
    height=600,
    plot_bgcolor='white',
    paper_bgcolor='white',
    font=dict(family='Inter, sans-serif', size=14, color='#2D3748'),
    title=dict(
        font=dict(size=24, color='#FF6B6B'),
        x=0.5,
        xanchor='center',
        y=0.95,
        yanchor='top'
    )
)

fig.show()

fig.write_html(
    "figures/edaplot3.html",
    include_plotlyjs="cdn",  # Use CDN to load Plotly JS
    full_html=False          # Only include the plot div
)
Number of rows after cleaning: 71254
Sample encoded data:
Empty DataFrame
Columns: [SALARY, Avg_Years_Experience, Remote_Type_Code, Edu_Level_Code, Job_Category]
Index: []
Code
import plotly.express as px
import pandas as pd

# Prepare the data
df = eda.copy()

# Define analytics jobs (Data Analyst + Business Analyst)
def classify_analytics_job(row):
    if row['DATA_ANALYST_JOB']:
        return True
    title = str(row['TITLE_NAME']).lower() if 'TITLE_NAME' in row else str(row['TITLE']).lower()
    if 'business analyst' in title:
        return True
    return False

df['IS_ANALYTICS_JOB'] = df.apply(classify_analytics_job, axis=1)
df['Job_Category'] = df['IS_ANALYTICS_JOB'].map({True: 'Analytics Job', False: 'Non-Analytics Job'})

# Calculate average years of experience
df['Avg_Years_Experience'] = (df['MIN_YEARS_EXPERIENCE'] + df['MAX_YEARS_EXPERIENCE']) / 2

# Clean the data (remove rows with missing salary or experience)
df = df.dropna(subset=['Avg_Years_Experience', 'SALARY'])

# Create the scatter plot with trend line
fig = px.scatter(df, 
                 x='Avg_Years_Experience', 
                 y='SALARY', 
                 color='Job_Category',
                 trendline='ols',  # Add trend line (ordinary least squares)
                 title='Experience Requirements vs Salary for Analytics Jobs',
                 labels={'Avg_Years_Experience': 'Average Years of Experience', 'SALARY': 'Salary ($)', 'Job_Category': 'Job Category'},
                 color_discrete_map={'Analytics Job': '#FF6B6B', 'Non-Analytics Job': '#4ECDC4'})

# Beautify the layout with a red-white theme (no gradients)
fig.update_layout(
    width=900,
    height=600,
    plot_bgcolor='#FFFFFF',  # Plain white background
    paper_bgcolor='#FFFFFF',  # Plain white background
    font=dict(family="Inter, sans-serif", size=14, color="#2D3748"),
    title=dict(
        font=dict(size=24, color="#FF6B6B"),  # Red title for theme
        x=0.5,
        xanchor="center",
        y=0.95,
        yanchor="top"
    ),
    xaxis=dict(
        title="Average Years of Experience",
        title_font=dict(size=16),
        tickfont=dict(size=12),
        gridcolor="#E2E8F0",
        linecolor="#2D3748",
        linewidth=2,
        showline=True,
        showgrid=True,
        zeroline=False
    ),
    yaxis=dict(
        title="Salary ($)",
        title_font=dict(size=16),
        tickfont=dict(size=12),
        gridcolor="#E2E8F0",
        linecolor="#2D3748",
        linewidth=2,
        showline=True,
        showgrid=True,
        zeroline=False
    ),
    legend=dict(
        title="Job Category",
        font=dict(size=13),
        bgcolor="#FFFFFF",
        bordercolor="#FF6B6B",  # Red border for theme
        borderwidth=1,
        x=1.02,
        y=0.5,
        xanchor="left",
        yanchor="middle"
    ),
    hovermode="closest",
    hoverlabel=dict(
        bgcolor="#FFFFFF",
        font_size=12,
        font_family="Inter, sans-serif",
        font_color="#2D3748",
        bordercolor="#FF6B6B"  # Red border for hover
    )
)

# Customize scatter points
fig.update_traces(
    marker=dict(
        size=8,
        opacity=0.7,
        line=dict(width=1, color="#2D3748")
    )
)
fig.show()

fig.write_html(
    "figures/edaplot4.html",
    include_plotlyjs="cdn",  # Use CDN to load Plotly JS
    full_html=False          # Only include the plot div
)
Code
import plotly.express as px
import pandas as pd

# Assuming eda is already loaded and DATA_ANALYST_JOB is defined
df = eda.copy()

# Step 1: Calculate average years of experience
df['Avg_Years_Experience'] = (df['MIN_YEARS_EXPERIENCE'] + df['MAX_YEARS_EXPERIENCE']) / 2

# Step 2: Clean the data (remove rows with missing values)
df = df.dropna(subset=['SALARY', 'Avg_Years_Experience', 'MIN_EDULEVELS_NAME', 'POSTED'])

# Debug: Check the number of rows after cleaning
print("Number of rows after cleaning:", len(df))

# Step 3: Map DATA_ANALYST_JOB to labels
df['Job_Category'] = df['DATA_ANALYST_JOB'].map({True: 'Analytics Job', False: 'Non-Analytics Job'})

# Step 4: Encode MIN_EDULEVELS_NAME numerically
edu_mapping = {'High School': 0, 'Bachelor\'s': 1, 'Master\'s': 2, 'PhD': 3}  # Adjust based on your unique values
df['Edu_Level_Code'] = df['MIN_EDULEVELS_NAME'].map(edu_mapping)

# Step 5: Clean again after encoding
df = df.dropna(subset=['Edu_Level_Code'])

# Step 6: Extract a time component from POSTED (e.g., year or year-month)
df['POSTED'] = pd.to_datetime(df['POSTED'])  # Ensure POSTED is in datetime format
df['Year_Month'] = df['POSTED'].dt.to_period('M').astype(str)  # Extract year-month for animation

# Debug: Check the time range
print("Time range for animation:", df['Year_Month'].unique())

# Step 7: Create the 3D scatter plot with animation
fig = px.scatter_3d(
    df,
    x='Avg_Years_Experience',
    y='SALARY',
    z='Edu_Level_Code',
    color='Job_Category',
    color_discrete_map={'Analytics Job': '#FF6B6B', 'Non-Analytics Job': '#4ECDC4'},
    animation_frame='Year_Month',  # Animate over time
    title='Salary, Experience, and Education Over Time by Job Category',
    labels={
        'Avg_Years_Experience': 'Avg. Years Experience',
        'SALARY': 'Salary ($)',
        'Edu_Level_Code': 'Min. Education Level (0=HS, 1=Bachelor\'s, 2=Master\'s, 3=PhD)'
    },
    hover_data=['Job_Category', 'Year_Month']
)

# Step 8: Update layout for styling
fig.update_layout(
    width=900,
    height=600,
    scene=dict(
        xaxis_title='Avg. Years Experience',
        yaxis_title='Salary ($)',
        zaxis_title='Min. Education Level',
        xaxis=dict(tickfont=dict(size=12)),
        yaxis=dict(tickfont=dict(size=12)),
        zaxis=dict(tickfont=dict(size=12))
    ),
    plot_bgcolor='white',
    paper_bgcolor='white',
    font=dict(family='Inter, sans-serif', size=14, color='#2D3748'),
    title=dict(
        font=dict(size=24, color='#FF6B6B'),
        x=0.5,
        xanchor='center',
        y=0.95,
        yanchor='top'
    ),
    legend=dict(
        title='Job Category',
        font=dict(size=13),
        bgcolor='#FFFFFF',
        bordercolor='#FF6B6B',
        borderwidth=1,
        x=1.02,
        y=0.5,
        xanchor='left',
        yanchor='middle'
    )
)

fig.show()

# Save the file
fig.write_html(
    "figures/edaplot5.html",
    include_plotlyjs="cdn",  # Use CDN to load Plotly JS
    full_html=False          # Only include the plot div
)
Number of rows after cleaning: 71254
Time range for animation: []